"""
Merge the node degree distance CSV files for 1981-2015 (the output of inventor_deg_dist.py) into a single file.

sample format:
patent_id_x	patent_year_x	patent_id_y	node_dist_first	node_dist_all
0	4794652	1989	4100619	NaN	0.0
1	4794652	1989	4581776	NaN	NaN
2	4794652	1989	4766615	NaN	NaN
3	4794652	1989	4209858	NaN	NaN
4	4794652	1989	4038700	NaN	NaN

Author: Shaoyu Liu
Date: 2023-03-29

Example SLURM job that produces the input for year 2009 with inventor_deg_dist.py:

#!/bin/bash
#SBATCH -n 1
#SBATCH --time=24:00:00
#SBATCH --mem-per-cpu=30000
python3 code/gender/inventor_deg_dist.py \
--path_inventor data/patent/g_inventor_disambiguated.tsv.zip \
--path_patent data/patent/g_patent.tsv.zip \
--year 2009 \
--output_dir data/patent/gender_interim/social_network/deg_distance/

"""

import pandas as pd
import os
import glob

# All relative paths below are resolved against the PatentsView data root.
os.chdir(r'/Volumes/Zihao_SSD2/PatentsView/')

# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the row order of the merged file is reproducible from run to run.
files = sorted(glob.glob("network_results/*.csv"))
if not files:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" ValueError.
    raise FileNotFoundError(
        f"no input files match 'network_results/*.csv' under {os.getcwd()}"
    )

# Make sure the output directory exists before the expensive read/concat, so
# a missing folder cannot abort the run at the very last step.
os.makedirs("cleandata", exist_ok=True)

# Stack every input file into one frame with a fresh 0..n-1 index.
df = pd.concat([pd.read_csv(file) for file in files], ignore_index=True)

# NOTE(review): the sample in the module docstring shows a leading unnamed
# index column; if the inputs were written with their index it will be carried
# into the output as "Unnamed: 0" -- confirm whether it should be dropped.

# Cast the identifier and year columns to compact integer dtypes. The casts
# are done column by column to avoid copying the whole frame at once.
# NOTE(review): presumably these three columns never contain missing values;
# an integer cast of NaN would raise -- confirm against the inputs.
column_dtypes = {
    "patent_id_x": "int32",
    "patent_id_y": "int32",
    "patent_year_x": "int16",
}
for column, dtype in column_dtypes.items():
    df[column] = df[column].astype(dtype)

print('Exporting dataset...')
df.to_csv("cleandata/shortest_path_sum_1981_2015.csv", index=False)
